/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

//! @file
//! cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of
//! samples data residing within device-accessible memory.

#pragma once

#include <cub/config.cuh>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#include <cub/device/dispatch/dispatch_histogram.cuh>

#include <cuda/std/array>
#include <cuda/std/iterator>
#include <cuda/std/limits>

CUB_NAMESPACE_BEGIN

//! @rst
//! DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of
//! samples data residing within device-accessible memory.
//!
//! Overview
//! ++++++++++++++++++++++++++
//!
//! A `histogram <http://en.wikipedia.org/wiki/Histogram>`_ counts the number of observations that fall into each
//! of the disjoint categories (known as *bins*).
//!
//! Usage Considerations
//! ++++++++++++++++++++++++++
//!
//! @cdp_class{DeviceHistogram}
//!
//! @endrst
struct DeviceHistogram
{
  //! @name Evenly-segmented bin ranges
  //! @{

  //! @rst
  //! Computes an intensity histogram from a sequence of data samples using equal-width bins.
  //!
  //! - The number of histogram bins is (``num_levels - 1``)
  //! - All bins comprise the same width of sample values: ``(upper_level - lower_level) / (num_levels - 1)``.
  //! - If the common type of ``SampleT`` and ``LevelT`` is of integral type, the bin for a sample is
  //!   computed as ``(sample - lower_level) * (num_levels - 1) / (upper_level - lower_level)``, round
  //!   down to the nearest whole number. To protect against potential overflows, if the product
  //!   ``(upper_level - lower_level) * (num_levels - 1)`` exceeds the number representable by an
  //!   ``uint64_t``, the cuda error ``cudaErrorInvalidValue`` is returned. If the common type is 128
  //!   bits wide, bin computation will use 128-bit arithmetic and ``cudaErrorInvalidValue`` will only
  //!   be returned if bin computation would overflow for 128-bit arithmetic.
  //! - The ranges ``[d_samples, d_samples + num_samples)`` and
  //!   ``[d_histogram, d_histogram + num_levels - 1)`` shall not overlap in any way.
  //! - ``cuda::std::common_type<LevelT, SampleT>`` must be valid, and both LevelT and SampleT must be valid
  //!   arithmetic types. The common type must be convertible to ``int`` and trivially copyable.
  //! - @devicestorage
  //!
  //! Snippet
  //! +++++++
  //!
  //! The code snippet below illustrates the computation of a six-bin histogram
  //! from a sequence of float samples
  //!
  //! .. code-block:: c++
  //!
  //!    #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
  //!
  //!    // Declare, allocate, and initialize device-accessible pointers for
  //!    // input samples and output histogram
  //!    int      num_samples;    // e.g., 10
  //!    float*   d_samples;      // e.g., [2.2, 6.1, 7.1, 2.9, 3.5, 0.3, 2.9, 2.1, 6.1, 999.5]
  //!    int*     d_histogram;    // e.g., [ -, -, -, -, -, -]
  //!    int      num_levels;     // e.g., 7       (seven level boundaries for six bins)
  //!    float    lower_level;    // e.g., 0.0     (lower sample value boundary of lowest bin)
  //!    float    upper_level;    // e.g., 12.0    (upper sample value boundary of upper bin)
  //!    ...
  //!
  //!    // Determine temporary device storage requirements
  //!    void*    d_temp_storage = nullptr;
  //!    size_t   temp_storage_bytes = 0;
  //!    cub::DeviceHistogram::HistogramEven(
  //!      d_temp_storage, temp_storage_bytes,
  //!      d_samples, d_histogram, num_levels,
  //!      lower_level, upper_level, num_samples);
  //!
  //!    // Allocate temporary storage
  //!    cudaMalloc(&d_temp_storage, temp_storage_bytes);
  //!
  //!    // Compute histograms
  //!    cub::DeviceHistogram::HistogramEven(
  //!      d_temp_storage, temp_storage_bytes,
  //!      d_samples, d_histogram, num_levels,
  //!      lower_level, upper_level, num_samples);
  //!
  //!    // d_histogram   <-- [1, 5, 0, 3, 0, 0];
  //!
  //! @endrst
  //!
  //! @tparam SampleIteratorT
  //!   **[inferred]** Random-access input iterator type for reading input samples @iterator
  //!
  //! @tparam CounterT
  //!   **[inferred]** Integer type for histogram bin counters
  //!
  //! @tparam LevelT
  //!   **[inferred]** Type for specifying boundaries (levels)
  //!
  //! @tparam OffsetT
  //!   **[inferred]** Signed integer type for sequence offsets, list lengths,
  //!   pointer differences, etc. @offset_size1
  //!
  //! @param[in] d_temp_storage
  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
  //!   required allocation size is written to `temp_storage_bytes` and no
  //!   work is done.
  //!
  //! @param[in,out] temp_storage_bytes
  //!   Reference to size in bytes of `d_temp_storage` allocation
  //!
  //! @param[in] d_samples
  //!   The pointer to the input sequence of data samples.
  //!
  //! @param[out] d_histogram
  //!   The pointer to the histogram counter output array of length
  //!   `num_levels - 1`.
  //!
  //! @param[in] num_levels
  //!   The number of boundaries (levels) for delineating histogram samples.
  //!   Implies that the number of bins is `num_levels - 1`.
  //!
  //! @param[in] lower_level
  //!   The lower sample value bound (inclusive) for the lowest histogram bin.
  //!
  //! @param[in] upper_level
  //!   The upper sample value bound (exclusive) for the highest histogram bin.
  //!
  //! @param[in] num_samples
  //!   The number of input samples (i.e., the length of `d_samples`)
  //!
  //! @param[in] stream
  //!   @rst
  //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
  //!   @endrst
  template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram,
    int num_levels,
    LevelT lower_level,
    LevelT upper_level,
    OffsetT num_samples,
    cudaStream_t stream = 0)
  {
    /// The sample value type of the input iterator
    using SampleT = cub::detail::it_value_t<SampleIteratorT>;
    return MultiHistogramEven<1, 1>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      ::cuda::std::array{d_histogram},
      ::cuda::std::array{num_levels},
      ::cuda::std::array{lower_level},
      ::cuda::std::array{upper_level},
      num_samples,
      static_cast<OffsetT>(1),
      sizeof(SampleT) * num_samples,
      stream);
  }

  //! @rst
  //! Computes an intensity histogram from a sequence of data samples using equal-width bins.
  //!
  //! - A two-dimensional *region of interest* within ``d_samples`` can be specified using
  //!   the ``num_row_samples``, ``num_rows``, and ``row_stride_bytes`` parameters.
  //! - The row stride must be a whole multiple of the sample data type
  //!   size, i.e., ``(row_stride_bytes % sizeof(SampleT)) == 0``.
  //! - The number of histogram bins is (``num_levels - 1``)
  //! - All bins comprise the same width of sample values: ``(upper_level - lower_level) / (num_levels - 1)``
  //! - If the common type of ``SampleT`` and ``LevelT`` is of integral type, the bin for a sample is
  //!   computed as ``(sample - lower_level) * (num_levels - 1) / (upper_level - lower_level)``, round
  //!   down to the nearest whole number. To protect against potential overflows, if the product
  //!   ``(upper_level - lower_level) * (num_levels - 1)`` exceeds the number representable by an
  //!   ``uint64_t``, the cuda error ``cudaErrorInvalidValue`` is returned. If the common type is 128
  //!   bits wide, bin computation will use 128-bit arithmetic and ``cudaErrorInvalidValue`` will only
  //!   be returned if bin computation would overflow for 128-bit arithmetic.
  //! - For a given row ``r`` in ``[0, num_rows)``, let
  //!   ``row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`` and
  //!   ``row_end = row_begin + num_row_samples``. The ranges
  //!   ``[row_begin, row_end)`` and ``[d_histogram, d_histogram + num_levels - 1)``
  //!   shall not overlap in any way.
  //! - ``cuda::std::common_type<LevelT, SampleT>`` must be valid, and both LevelT
  //!   and SampleT must be valid arithmetic types. The common type must be
  //!   convertible to ``int`` and trivially copyable.
  //! - @devicestorage
  //!
  //! Snippet
  //! +++++++
  //!
  //! The code snippet below illustrates the computation of a six-bin histogram
  //! from a 2x5 region of interest within a flattened 2x7 array of float samples.
  //!
  //! .. code-block:: c++
  //!
  //!    #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
  //!
  //!    // Declare, allocate, and initialize device-accessible pointers for
  //!    // input samples and output histogram
  //!    int      num_row_samples;    // e.g., 5
  //!    int      num_rows;           // e.g., 2;
  //!    size_t   row_stride_bytes;   // e.g., 7 * sizeof(float)
  //!    float*   d_samples;          // e.g., [2.2, 6.1, 7.1, 2.9, 3.5,   -, -,
  //!                                 //        0.3, 2.9, 2.1, 6.1, 999.5, -, -]
  //!    int*     d_histogram;        // e.g., [ -, -, -, -, -, -]
  //!    int      num_levels;         // e.g., 7       (seven level boundaries for six bins)
  //!    float    lower_level;        // e.g., 0.0     (lower sample value boundary of lowest bin)
  //!    float    upper_level;        // e.g., 12.0    (upper sample value boundary of upper bin)
  //!    ...
  //!
  //!    // Determine temporary device storage requirements
  //!    void*    d_temp_storage  = nullptr;
  //!    size_t   temp_storage_bytes = 0;
  //!    cub::DeviceHistogram::HistogramEven(
  //!        d_temp_storage, temp_storage_bytes,
  //!        d_samples, d_histogram, num_levels, lower_level, upper_level,
  //!        num_row_samples, num_rows, row_stride_bytes);
  //!
  //!    // Allocate temporary storage
  //!    cudaMalloc(&d_temp_storage, temp_storage_bytes);
  //!
  //!    // Compute histograms
  //!    cub::DeviceHistogram::HistogramEven(
  //!        d_temp_storage, temp_storage_bytes, d_samples, d_histogram,
  //!        d_samples, d_histogram, num_levels, lower_level, upper_level,
  //!        num_row_samples, num_rows, row_stride_bytes);
  //!
  //!    // d_histogram   <-- [1, 5, 0, 3, 0, 0];
  //!
  //! @endrst
  //!
  //! @tparam SampleIteratorT
  //!   **[inferred]** Random-access input iterator type for reading
  //!   input samples. @iterator
  //!
  //! @tparam CounterT
  //!   **[inferred]** Integer type for histogram bin counters
  //!
  //! @tparam LevelT
  //!   **[inferred]** Type for specifying boundaries (levels)
  //!
  //! @tparam OffsetT
  //!   **[inferred]** Signed integer type for sequence offsets, list lengths,
  //!   pointer differences, etc. @offset_size1
  //!
  //! @param[in] d_temp_storage
  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
  //!   required allocation size is written to `temp_storage_bytes` and no
  //!   work is done.
  //!
  //! @param[in,out] temp_storage_bytes
  //!   Reference to size in bytes of `d_temp_storage` allocation
  //!
  //! @param[in] d_samples
  //!   The pointer to the input sequence of data samples.
  //!
  //! @param[out] d_histogram
  //!   The pointer to the histogram counter output array of
  //!   length `num_levels - 1`.
  //!
  //! @param[in] num_levels
  //!   The number of boundaries (levels) for delineating histogram samples.
  //!   Implies that the number of bins is `num_levels - 1`.
  //!
  //! @param[in] lower_level
  //!   The lower sample value bound (inclusive) for the lowest histogram bin.
  //!
  //! @param[in] upper_level
  //!   The upper sample value bound (exclusive) for the highest histogram bin.
  //!
  //! @param[in] num_row_samples
  //!   The number of data samples per row in the region of interest
  //!
  //! @param[in] num_rows
  //!   The number of rows in the region of interest
  //!
  //! @param[in] row_stride_bytes
  //!   The number of bytes between starts of consecutive rows in
  //!   the region of interest
  //!
  //! @param[in] stream
  //!   @rst
  //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
  //!   @endrst
  template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram,
    int num_levels,
    LevelT lower_level,
    LevelT upper_level,
    OffsetT num_row_samples,
    OffsetT num_rows,
    size_t row_stride_bytes,
    cudaStream_t stream = 0)
  {
    return MultiHistogramEven<1, 1>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      ::cuda::std::array{d_histogram},
      ::cuda::std::array{num_levels},
      ::cuda::std::array{lower_level},
      ::cuda::std::array{upper_level},
      num_row_samples,
      num_rows,
      row_stride_bytes,
      stream);
  }

  //! @rst
  //! Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using
  //! equal-width bins.
  //!
  //! - The input is a sequence of *pixel* structures, where each pixel comprises
  //!   a record of ``NUM_CHANNELS`` consecutive data samples
  //!   (e.g., an *RGBA* pixel).
  //! - ``NUM_CHANNELS`` can be up to 4.
  //! - Of the ``NUM_CHANNELS`` specified, the function will only compute
  //!   histograms for the first ``NUM_ACTIVE_CHANNELS``
  //!   (e.g., only *RGB* histograms from *RGBA* pixel samples).
  //! - The number of histogram bins for channel\ :sub:`i` is ``num_levels[i] - 1``.
  //! - For channel\ :sub:`i`, the range of values for all histogram bins have the same width:
  //!   ``(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)``
  //! - If the common type of sample and level is of integral type, the bin for a sample is
  //!   computed as ``(sample - lower_level[i]) * (num_levels - 1) / (upper_level[i] - lower_level[i])``, round down
  //!   to the nearest whole number. To protect against potential overflows, if, for any channel ``i``, the product
  //!   ``(upper_level[i] - lower_level[i]) * (num_levels[i] - 1)`` exceeds the number representable by an ``uint64_t``,
  //!   the cuda error ``cudaErrorInvalidValue`` is returned. If the common type is 128 bits wide, bin computation
  //!   will use 128-bit arithmetic and ``cudaErrorInvalidValue`` will only be returned if bin
  //!   computation would overflow for 128-bit arithmetic.
  //! - For a given channel ``c`` in ``[0, NUM_ACTIVE_CHANNELS)``, the ranges
  //!   ``[d_samples, d_samples + NUM_CHANNELS * num_pixels)`` and
  //!   ``[d_histogram[c], d_histogram[c] + num_levels[c] - 1)`` shall not overlap in any way.
  //! - ``cuda::std::common_type<LevelT, SampleT>`` must be valid, and both LevelT
  //!   and SampleT must be valid arithmetic types.
  //!   The common type must be convertible to ``int`` and trivially copyable.
  //! - @devicestorage
  //!
  //! Snippet
  //! +++++++
  //!
  //! The code snippet below illustrates the computation of three 256-bin *RGB* histograms
  //! from a quad-channel sequence of *RGBA* pixels (8 bits per channel per pixel)
  //!
  //! .. code-block:: c++
  //!
  //!    #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
  //!
  //!    // Declare, allocate, and initialize device-accessible pointers for
  //!    // input samples and output histograms
  //!    int              num_pixels;         // e.g., 5
  //!    unsigned char*   d_samples;          // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2),
  //!                                         //        (0, 6, 7, 5), (3, 0, 2, 6)]
  //!    int*             d_histogram[3];     // e.g., three device pointers to three device buffers,
  //!                                         //       each allocated with 256 integer counters
  //!    int              num_levels[3];      // e.g., {257, 257, 257};
  //!    unsigned int     lower_level[3];     // e.g., {0, 0, 0};
  //!    unsigned int     upper_level[3];     // e.g., {256, 256, 256};
  //!    ...
  //!
  //!    // Determine temporary device storage requirements
  //!    void*    d_temp_storage = nullptr;
  //!    size_t   temp_storage_bytes = 0;
  //!    cub::DeviceHistogram::MultiHistogramEven<4, 3>(
  //!      d_temp_storage, temp_storage_bytes,
  //!      d_samples, d_histogram, num_levels,
  //!      lower_level, upper_level, num_pixels);
  //!
  //!    // Allocate temporary storage
  //!    cudaMalloc(&d_temp_storage, temp_storage_bytes);
  //!
  //!    // Compute histograms
  //!    cub::DeviceHistogram::MultiHistogramEven<4, 3>(
  //!      d_temp_storage, temp_storage_bytes,
  //!      d_samples, d_histogram, num_levels,
  //!      lower_level, upper_level, num_pixels);
  //!
  //!    // d_histogram   <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0],
  //!    //                     [0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0],
  //!    //                     [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ]
  //!
  //! @endrst
  //!
  //! @tparam NUM_CHANNELS
  //!   Number of channels interleaved in the input data (may be greater than
  //!   the number of channels being actively histogrammed)
  //!
  //! @tparam NUM_ACTIVE_CHANNELS
  //!   **[inferred]** Number of channels actively being histogrammed
  //!
  //! @tparam SampleIteratorT
  //!   **[inferred]** Random-access input iterator type for reading
  //!   input samples. @iterator
  //!
  //! @tparam CounterT
  //!   **[inferred]** Integer type for histogram bin counters
  //!
  //! @tparam LevelT
  //!   **[inferred]** Type for specifying boundaries (levels)
  //!
  //! @tparam OffsetT
  //!   **[inferred]** Signed integer type for sequence offsets, list lengths,
  //!   pointer differences, etc. @offset_size1
  //!
  //! @param[in] d_temp_storage
  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
  //!   required allocation size is written to `temp_storage_bytes` and no
  //!   work is done.
  //!
  //! @param[in,out] temp_storage_bytes
  //!   Reference to size in bytes of `d_temp_storage` allocation
  //!
  //! @param[in] d_samples
  //!   The pointer to the multi-channel input sequence of data samples.
  //!   The samples from different channels are assumed to be interleaved
  //!   (e.g., an array of 32-bit pixels where each pixel consists of four
  //!   *RGBA* 8-bit samples).
  //!
  //! @param[out] d_histogram
  //!   @rst
  //!   The pointers to the histogram counter output arrays, one for each active
  //!   channel. For channel\ :sub:`i`, the allocation length of
  //!   ``d_histogram[i]`` should be `num_levels[i] - 1``.
  //!   @endrst
  //!
  //! @param[in] num_levels
  //!   @rst
  //!   The number of boundaries (levels) for delineating histogram samples in each active channel.
  //!   Implies that the number of bins for channel\ :sub:`i` is ``num_levels[i] - 1``.
  //!   @endrst
  //!
  //! @param[in] lower_level
  //!   The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
  //!
  //! @param[in] upper_level
  //!   The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
  //!
  //! @param[in] num_pixels
  //!   The number of multi-channel pixels (i.e., the length of `d_samples / NUM_CHANNELS`)
  //!
  //! @param[in] stream
  //!   @rst
  //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
  //!   @endrst
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    ::cuda::std::array<CounterT*, NUM_ACTIVE_CHANNELS> d_histogram,
    ::cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_levels,
    ::cuda::std::array<LevelT, NUM_ACTIVE_CHANNELS> lower_level,
    ::cuda::std::array<LevelT, NUM_ACTIVE_CHANNELS> upper_level,
    OffsetT num_pixels,
    cudaStream_t stream = 0)
  {
    /// The sample value type of the input iterator
    using SampleT = cub::detail::it_value_t<SampleIteratorT>;

    return MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_histogram,
      num_levels,
      lower_level,
      upper_level,
      num_pixels,
      static_cast<OffsetT>(1),
      sizeof(SampleT) * NUM_CHANNELS * num_pixels,
      stream);
  }

private:
  template <size_t N, typename T>
  _CCCL_HOST_DEVICE static auto to_array(T* ptr)
  {
    ::cuda::std::array<::cuda::std::remove_const_t<T>, N> a{};
    ::cuda::std::copy(ptr, ptr + N, a.begin());
    return a;
  }

public:
  //! Deprecate [Since 3.0]
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CCCL_DEPRECATED_BECAUSE("Prefer the new overload taking cuda::std::arrays")
  CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram[NUM_ACTIVE_CHANNELS],
    const int num_levels[NUM_ACTIVE_CHANNELS],
    const LevelT lower_level[NUM_ACTIVE_CHANNELS],
    const LevelT upper_level[NUM_ACTIVE_CHANNELS],
    OffsetT num_pixels,
    cudaStream_t stream = 0)
  {
    /// The sample value type of the input iterator
    using SampleT = cub::detail::it_value_t<SampleIteratorT>;
    return MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      to_array<NUM_ACTIVE_CHANNELS>(d_histogram),
      to_array<NUM_ACTIVE_CHANNELS>(num_levels),
      to_array<NUM_ACTIVE_CHANNELS>(lower_level),
      to_array<NUM_ACTIVE_CHANNELS>(upper_level),
      num_pixels,
      stream);
  }

  //! @rst
  //! Computes per-channel intensity histograms from a sequence of
  //! multi-channel "pixel" data samples using equal-width bins.
  //!
  //! - The input is a sequence of *pixel* structures, where each pixel
  //!   comprises a record of ``NUM_CHANNELS`` consecutive data samples (e.g., an *RGBA* pixel).
  //! - ``NUM_CHANNELS`` can be up to 4.
  //! - Of the ``NUM_CHANNELS`` specified, the function will only compute
  //!   histograms for the first ``NUM_ACTIVE_CHANNELS`` (e.g., only *RGB*
  //!   histograms from *RGBA* pixel samples).
  //! - A two-dimensional *region of interest* within ``d_samples`` can be
  //!   specified using the ``num_row_samples``, ``num_rows``, and ``row_stride_bytes`` parameters.
  //! - The row stride must be a whole multiple of the sample data type
  //!   size, i.e., ``(row_stride_bytes % sizeof(SampleT)) == 0``.
  //! - The number of histogram bins for channel\ :sub:`i` is ``num_levels[i] - 1``.
  //! - For channel\ :sub:`i`, the range of values for all histogram bins have the same width:
  //!   ``(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)``
  //! - If the common type of sample and level is of integral type, the bin for a sample is
  //!   computed as ``(sample - lower_level[i]) * (num_levels - 1) / (upper_level[i] - lower_level[i])``,
  //!   round down to the nearest whole number. To protect against potential overflows, if, for any channel ``i``,
  //!   the product ``(upper_level[i] - lower_level[i]) * (num_levels[i] - 1)`` exceeds the number representable by
  //!   an ``uint64_t``, the cuda error ``cudaErrorInvalidValue`` is returned.
  //!   If the common type is 128 bits wide, bin computation will use 128-bit arithmetic and ``cudaErrorInvalidValue``
  //!   will only be returned if bin computation would overflow for 128-bit arithmetic.
  //! - For a given row ``r`` in ``[0, num_rows)``, and sample ``s`` in
  //!   ``[0, num_row_pixels)``, let
  //!   ``row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)``,
  //!   ``sample_begin = row_begin + s * NUM_CHANNELS``, and
  //!   ``sample_end = sample_begin + NUM_ACTIVE_CHANNELS``. For a given channel ``c`` in
  //!   ``[0, NUM_ACTIVE_CHANNELS)``, the ranges
  //!   ``[sample_begin, sample_end)`` and
  //!   ``[d_histogram[c], d_histogram[c] + num_levels[c] - 1)`` shall not overlap in any way.
  //! - ``cuda::std::common_type<LevelT, SampleT>`` must be valid, and both LevelT
  //!   and SampleT must be valid arithmetic types. The common type must be
  //!   convertible to ``int`` and trivially copyable.
  //! - @devicestorage
  //!
  //! Snippet
  //! +++++++
  //!
  //! The code snippet below illustrates the computation of three 256-bin
  //! *RGB* histograms from a 2x3 region of interest of within a flattened 2x4
  //! array of quad-channel *RGBA* pixels (8 bits per channel per pixel).
  //!
  //! .. code-block:: c++
  //!
  //!    #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
  //!
  //!    // Declare, allocate, and initialize device-accessible pointers for input
  //!    // samples and output histograms
  //!    int              num_row_pixels;     // e.g., 3
  //!    int              num_rows;           // e.g., 2
  //!    size_t           row_stride_bytes;   // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS
  //!    unsigned char*   d_samples;          // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -),
  //!                                         //        (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)]
  //!    int*             d_histogram[3];     // e.g., three device pointers to three device buffers,
  //!                                         //       each allocated with 256 integer counters
  //!    int              num_levels[3];      // e.g., {257, 257, 257};
  //!    unsigned int     lower_level[3];     // e.g., {0, 0, 0};
  //!    unsigned int     upper_level[3];     // e.g., {256, 256, 256};
  //!    ...
  //!
  //!    // Determine temporary device storage requirements
  //!    void*    d_temp_storage = nullptr;
  //!    size_t   temp_storage_bytes = 0;
  //!    cub::DeviceHistogram::MultiHistogramEven<4, 3>(
  //!      d_temp_storage, temp_storage_bytes,
  //!      d_samples, d_histogram, num_levels, lower_level, upper_level,
  //!      num_row_pixels, num_rows, row_stride_bytes);
  //!
  //!    // Allocate temporary storage
  //!    cudaMalloc(&d_temp_storage, temp_storage_bytes);
  //!
  //!    // Compute histograms
  //!    cub::DeviceHistogram::MultiHistogramEven<4, 3>(
  //!      d_temp_storage, temp_storage_bytes,
  //!      d_samples, d_histogram, num_levels, lower_level, upper_level,
  //!      num_row_pixels, num_rows, row_stride_bytes);
  //!
  //!    // d_histogram   <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0],
  //!    //                     [0, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0],
  //!    //                     [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ]
  //!
  //! @endrst
  //!
  //! @tparam NUM_CHANNELS
  //!   Number of channels interleaved in the input data (may be greater than
  //!   the number of channels being actively histogrammed)
  //!
  //! @tparam NUM_ACTIVE_CHANNELS
  //!   **[inferred]** Number of channels actively being histogrammed
  //!
  //! @tparam SampleIteratorT
  //!   **[inferred]** Random-access input iterator type for reading input
  //!   samples. @iterator
  //!
  //! @tparam CounterT
  //!   **[inferred]** Integer type for histogram bin counters
  //!
  //! @tparam LevelT
  //!   **[inferred]** Type for specifying boundaries (levels)
  //!
  //! @tparam OffsetT
  //!   **[inferred]** Signed integer type for sequence offsets, list lengths,
  //!   pointer differences, etc. @offset_size1
  //!
  //! @param[in] d_temp_storage
  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
  //!   required allocation size is written to `temp_storage_bytes` and no
  //!   work is done.
  //!
  //! @param[in,out] temp_storage_bytes
  //!   Reference to size in bytes of `d_temp_storage` allocation
  //!
  //! @param[in] d_samples
  //!   The pointer to the multi-channel input sequence of data samples. The
  //!   samples from different channels are assumed to be interleaved (e.g.,
  //!   an array of 32-bit pixels where each pixel consists of four
  //!   *RGBA* 8-bit samples).
  //!
  //! @param[out] d_histogram
  //!   @rst
  //!   The pointers to the histogram counter output arrays, one for each
  //!   active channel. For channel\ :sub:`i`, the allocation length
  //!   of ``d_histogram[i]`` should be ``num_levels[i] - 1``.
  //!   @endrst
  //!
  //! @param[in] num_levels
  //!   @rst
  //!   The number of boundaries (levels) for delineating histogram samples in each active channel.
  //!   Implies that the number of bins for channel\ :sub:`i` is ``num_levels[i] - 1``.
  //!   @endrst
  //!
  //! @param[in] lower_level
  //!   The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
  //!
  //! @param[in] upper_level
  //!   The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
  //!
  //! @param[in] num_row_pixels
  //!   The number of multi-channel pixels per row in the region of interest
  //!
  //! @param[in] num_rows
  //!   The number of rows in the region of interest
  //!
  //! @param[in] row_stride_bytes
  //!   The number of bytes between starts of consecutive rows in the region of
  //!   interest
  //!
  //! @param[in] stream
  //!   @rst
  //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
  //!   @endrst
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    ::cuda::std::array<CounterT*, NUM_ACTIVE_CHANNELS> d_histogram,
    ::cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_levels,
    ::cuda::std::array<LevelT, NUM_ACTIVE_CHANNELS> lower_level,
    ::cuda::std::array<LevelT, NUM_ACTIVE_CHANNELS> upper_level,
    OffsetT num_row_pixels,
    OffsetT num_rows,
    size_t row_stride_bytes,
    cudaStream_t stream = 0)
  {
    _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceHistogram::MultiHistogramEven");

    /// The sample value type of the input iterator
    using SampleT = cub::detail::it_value_t<SampleIteratorT>;
    ::cuda::std::bool_constant<sizeof(SampleT) == 1> is_byte_sample;

    if constexpr (sizeof(OffsetT) > sizeof(int))
    {
      if ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX)
      {
        // Down-convert OffsetT data type
        return DispatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchEven(
          d_temp_storage,
          temp_storage_bytes,
          d_samples,
          d_histogram,
          num_levels,
          lower_level,
          upper_level,
          (int) num_row_pixels,
          (int) num_rows,
          (int) (row_stride_bytes / sizeof(SampleT)),
          stream,
          is_byte_sample);
      }
    }

    return DispatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchEven(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_histogram,
      num_levels,
      lower_level,
      upper_level,
      num_row_pixels,
      num_rows,
      (OffsetT) (row_stride_bytes / sizeof(SampleT)),
      stream,
      is_byte_sample);
  }

  //! Deprecate [Since 3.0]
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CCCL_DEPRECATED_BECAUSE("Prefer the new overload taking cuda::std::arrays")
  CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram[NUM_ACTIVE_CHANNELS],
    const int num_levels[NUM_ACTIVE_CHANNELS],
    const LevelT lower_level[NUM_ACTIVE_CHANNELS],
    const LevelT upper_level[NUM_ACTIVE_CHANNELS],
    OffsetT num_row_pixels,
    OffsetT num_rows,
    size_t row_stride_bytes,
    cudaStream_t stream = 0)
  {
    return MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      to_array<NUM_ACTIVE_CHANNELS>(d_histogram),
      to_array<NUM_ACTIVE_CHANNELS>(num_levels),
      to_array<NUM_ACTIVE_CHANNELS>(lower_level),
      to_array<NUM_ACTIVE_CHANNELS>(upper_level),
      num_row_pixels,
      num_rows,
      row_stride_bytes,
      stream);
  }

  //! @}  end member group
  //! @name Custom bin ranges
  //! @{

  //! @rst
  //! Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels.
  //!
  //! - The number of histogram bins is (``num_levels - 1``)
  //! - The value range for bin\ :sub:`i` is ``[level[i], level[i+1])``
  //! - The range ``[d_histogram, d_histogram + num_levels - 1)`` shall not
  //!   overlap ``[d_samples, d_samples + num_samples)`` nor
  //!   ``[d_levels, d_levels + num_levels)`` in any way. The ranges
  //!   ``[d_levels, d_levels + num_levels)`` and
  //!   ``[d_samples, d_samples + num_samples)`` may overlap.
  //! - @devicestorage
  //!
  //! Snippet
  //! +++++++
  //!
  //! The code snippet below illustrates the computation of an six-bin histogram
  //! from a sequence of float samples
  //!
  //! .. code-block:: c++
  //!
  //!    #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
  //!
  //!    // Declare, allocate, and initialize device-accessible pointers for input
  //!    // samples and output histogram
  //!    int      num_samples;    // e.g., 10
  //!    float*   d_samples;      // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5]
  //!    int*     d_histogram;    // e.g., [ -, -, -, -, -, -]
  //!    int      num_levels      // e.g., 7 (seven level boundaries for six bins)
  //!    float*   d_levels;       // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0]
  //!    ...
  //!
  //!    // Determine temporary device storage requirements
  //!    void*    d_temp_storage = nullptr;
  //!    size_t   temp_storage_bytes = 0;
  //!    cub::DeviceHistogram::HistogramRange(
  //!      d_temp_storage, temp_storage_bytes,
  //!      d_samples, d_histogram, num_levels, d_levels, num_samples);
  //!
  //!    // Allocate temporary storage
  //!    cudaMalloc(&d_temp_storage, temp_storage_bytes);
  //!
  //!    // Compute histograms
  //!    cub::DeviceHistogram::HistogramRange(
  //!      d_temp_storage, temp_storage_bytes,
  //!      d_samples, d_histogram, num_levels, d_levels, num_samples);
  //!
  //!    // d_histogram   <-- [1, 5, 0, 3, 0, 0];
  //!
  //! @endrst
  //!
  //! @tparam SampleIteratorT
  //!   **[inferred]** Random-access input iterator type for reading
  //!   input samples. @iterator
  //!
  //! @tparam CounterT
  //!   **[inferred]** Integer type for histogram bin counters
  //!
  //! @tparam LevelT
  //!   **[inferred]** Type for specifying boundaries (levels)
  //!
  //! @tparam OffsetT
  //!   **[inferred]** Signed integer type for sequence offsets, list lengths,
  //!   pointer differences, etc. @offset_size1
  //!
  //! @param[in] d_temp_storage
  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
  //!   required allocation size is written to `temp_storage_bytes` and no work
  //!   is done.
  //!
  //! @param[in,out] temp_storage_bytes
  //!   Reference to size in bytes of `d_temp_storage` allocation
  //!
  //! @param[in] d_samples
  //!   The pointer to the input sequence of data samples.
  //!
  //! @param[out] d_histogram
  //!   The pointer to the histogram counter output array of length
  //!   `num_levels - 1`.
  //!
  //! @param[in] num_levels
  //!   The number of boundaries (levels) for delineating histogram samples.
  //!   Implies that the number of bins is `num_levels - 1`.
  //!
  //! @param[in] d_levels
  //!   The pointer to the array of boundaries (levels). Bin ranges are defined
  //!   by consecutive boundary pairings: lower sample value boundaries are
  //!   inclusive and upper sample value boundaries are exclusive.
  //!
  //! @param[in] num_samples
  //!   The number of data samples per row in the region of interest
  //!
  //! @param[in] stream
  //!   @rst
  //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
  //!   @endrst
  template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram,
    int num_levels,
    const LevelT* d_levels,
    OffsetT num_samples,
    cudaStream_t stream = 0)
  {
    /// The sample value type of the input iterator
    using SampleT = cub::detail::it_value_t<SampleIteratorT>;
    return MultiHistogramRange<1, 1>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      ::cuda::std::array{d_histogram},
      ::cuda::std::array{num_levels},
      ::cuda::std::array{d_levels},
      num_samples,
      (OffsetT) 1,
      (size_t) (sizeof(SampleT) * num_samples),
      stream);
  }

  //! @rst
  //! Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels.
  //!
  //! - A two-dimensional *region of interest* within ``d_samples`` can be
  //!   specified using the ``num_row_samples``, ``num_rows``, and ``row_stride_bytes`` parameters.
  //! - The row stride must be a whole multiple of the sample data type
  //!   size, i.e., ``(row_stride_bytes % sizeof(SampleT)) == 0``.
  //! - The number of histogram bins is (``num_levels - 1``)
  //! - The value range for bin\ :sub:`i` is ``[level[i], level[i+1])``
  //! - For a given row ``r`` in ``[0, num_rows)``, let
  //!   ``row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`` and
  //!   ``row_end = row_begin + num_row_samples``. The range
  //!   ``[d_histogram, d_histogram + num_levels - 1)`` shall not overlap
  //!   ``[row_begin, row_end)`` nor ``[d_levels, d_levels + num_levels)``.
  //!   The ranges ``[d_levels, d_levels + num_levels)`` and ``[row_begin, row_end)`` may overlap.
  //! - @devicestorage
  //!
  //! Snippet
  //! +++++++
  //!
  //! The code snippet below illustrates the computation of a six-bin histogram
  //! from a 2x5 region of interest within a flattened 2x7 array of float samples.
  //!
  //! .. code-block:: c++
  //!
  //!    #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
  //!
  //!    // Declare, allocate, and initialize device-accessible pointers for input samples and
  //!    // output histogram
  //!    int      num_row_samples;    // e.g., 5
  //!    int      num_rows;           // e.g., 2;
  //!    int      row_stride_bytes;   // e.g., 7 * sizeof(float)
  //!    float*   d_samples;          // e.g., [2.2, 6.0, 7.1, 2.9, 3.5,   -, -,
  //!                                 //        0.3, 2.9, 2.0, 6.1, 999.5, -, -]
  //!    int*     d_histogram;        // e.g., [ -, -, -, -, -, -]
  //!    int      num_levels          // e.g., 7 (seven level boundaries for six bins)
  //!    float    *d_levels;          // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0]
  //!    ...
  //!
  //!    // Determine temporary device storage requirements
  //!    void*    d_temp_storage = nullptr;
  //!    size_t   temp_storage_bytes = 0;
  //!    cub::DeviceHistogram::HistogramRange(
  //!      d_temp_storage, temp_storage_bytes,
  //!      d_samples, d_histogram, num_levels, d_levels,
  //!      num_row_samples, num_rows, row_stride_bytes);
  //!
  //!    // Allocate temporary storage
  //!    cudaMalloc(&d_temp_storage, temp_storage_bytes);
  //!
  //!    // Compute histograms
  //!    cub::DeviceHistogram::HistogramRange(
  //!      d_temp_storage, temp_storage_bytes,
  //!      d_samples, d_histogram, num_levels, d_levels,
  //!      num_row_samples, num_rows, row_stride_bytes);
  //!
  //!    // d_histogram   <-- [1, 5, 0, 3, 0, 0];
  //!
  //! @endrst
  //!
  //! @tparam SampleIteratorT
  //!   **[inferred]** Random-access input iterator type for reading
  //!   input samples. @iterator
  //!
  //! @tparam CounterT
  //!   **[inferred]** Integer type for histogram bin counters
  //!
  //! @tparam LevelT
  //!   **[inferred]** Type for specifying boundaries (levels)
  //!
  //! @tparam OffsetT
  //!   **[inferred]** Signed integer type for sequence offsets, list lengths,
  //!   pointer differences, etc. @offset_size1
  //!
  //! @param[in] d_temp_storage
  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
  //!   required allocation size is written to `temp_storage_bytes` and no
  //!   work is done.
  //!
  //! @param[in,out] temp_storage_bytes
  //!   Reference to size in bytes of `d_temp_storage` allocation
  //!
  //! @param[in] d_samples
  //!   The pointer to the input sequence of data samples.
  //!
  //! @param[out] d_histogram
  //!   The pointer to the histogram counter output array of length
  //!   `num_levels - 1`.
  //!
  //! @param[in] num_levels
  //!   The number of boundaries (levels) for delineating histogram samples.
  //!   Implies that the number of bins is `num_levels - 1`.
  //!
  //! @param[in] d_levels
  //!   The pointer to the array of boundaries (levels). Bin ranges are defined
  //!   by consecutive boundary pairings: lower sample value boundaries are
  //!   inclusive and upper sample value boundaries are exclusive.
  //!
  //! @param[in] num_row_samples
  //!   The number of data samples per row in the region of interest
  //!
  //! @param[in] num_rows
  //!   The number of rows in the region of interest
  //!
  //! @param[in] row_stride_bytes
  //!   The number of bytes between starts of consecutive rows in the region
  //!   of interest
  //!
  //! @param[in] stream
  //!   @rst
  //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
  //!   @endrst
  template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram,
    int num_levels,
    const LevelT* d_levels,
    OffsetT num_row_samples,
    OffsetT num_rows,
    size_t row_stride_bytes,
    cudaStream_t stream = 0)
  {
    return MultiHistogramRange<1, 1>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      ::cuda::std::array{d_histogram},
      ::cuda::std::array{num_levels},
      ::cuda::std::array{d_levels},
      num_row_samples,
      num_rows,
      row_stride_bytes,
      stream);
  }

  //! @rst
  //! Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples
  //! using the specified bin boundary levels.
  //!
  //! - The input is a sequence of *pixel* structures, where each pixel
  //!   comprises a record of ``NUM_CHANNELS`` consecutive data samples (e.g., an *RGBA* pixel).
  //! - ``NUM_CHANNELS`` can be up to 4.
  //! - Of the ``NUM_CHANNELS`` specified, the function will only compute
  //!   histograms for the first ``NUM_ACTIVE_CHANNELS`` (e.g., *RGB* histograms from *RGBA* pixel samples).
  //! - The number of histogram bins for channel\ :sub:`i` is ``num_levels[i] - 1``.
  //! - For channel\ :sub:`i`, the range of values for all histogram bins have the same width:
  //!   ``(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)``
  //! - For given channels ``c1`` and ``c2`` in ``[0, NUM_ACTIVE_CHANNELS)``, the
  //!   range ``[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)`` shall
  //!   not overlap ``[d_samples, d_samples + NUM_CHANNELS * num_pixels)`` nor
  //!   ``[d_levels[c2], d_levels[c2] + num_levels[c2])`` in any way.
  //!   The ranges ``[d_levels[c2], d_levels[c2] + num_levels[c2])`` and
  //!   ``[d_samples, d_samples + NUM_CHANNELS * num_pixels)`` may overlap.
  //! - @devicestorage
  //!
  //! Snippet
  //! +++++++
  //!
  //! The code snippet below illustrates the computation of three 4-bin *RGB*
  //! histograms from a quad-channel sequence of *RGBA* pixels
  //! (8 bits per channel per pixel)
  //!
  //! .. code-block:: c++
  //!
  //!    #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
  //!
  //!    // Declare, allocate, and initialize device-accessible pointers for
  //!    // input samples and output histograms
  //!    int            num_pixels;       // e.g., 5
  //!    unsigned char  *d_samples;       // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2),
  //!                                     //        (0, 6, 7, 5),(3, 0, 2, 6)]
  //!    unsigned int   *d_histogram[3];  // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]];
  //!    int            num_levels[3];    // e.g., {5, 5, 5};
  //!    unsigned int   *d_levels[3];     // e.g., [ [0, 2, 4, 6, 8],
  //!                                     //         [0, 2, 4, 6, 8],
  //!                                     //         [0, 2, 4, 6, 8] ];
  //!    ...
  //!
  //!    // Determine temporary device storage requirements
  //!    void*    d_temp_storage = nullptr;
  //!    size_t   temp_storage_bytes = 0;
  //!    cub::DeviceHistogram::MultiHistogramRange<4, 3>(
  //!      d_temp_storage, temp_storage_bytes,
  //!      d_samples, d_histogram, num_levels, d_levels, num_pixels);
  //!
  //!    // Allocate temporary storage
  //!    cudaMalloc(&d_temp_storage, temp_storage_bytes);
  //!
  //!    // Compute histograms
  //!    cub::DeviceHistogram::MultiHistogramRange<4, 3>(
  //!      d_temp_storage, temp_storage_bytes,
  //!      d_samples, d_histogram, num_levels, d_levels, num_pixels);
  //!
  //!    // d_histogram   <-- [ [1, 3, 0, 1],
  //!    //                     [3, 0, 0, 2],
  //!    //                     [0, 2, 0, 3] ]
  //!
  //! @endrst
  //!
  //! @tparam NUM_CHANNELS
  //!   Number of channels interleaved in the input data (may be greater than
  //!   the number of channels being actively histogrammed)
  //!
  //! @tparam NUM_ACTIVE_CHANNELS
  //!   **[inferred]** Number of channels actively being histogrammed
  //!
  //! @tparam SampleIteratorT
  //!   **[inferred]** Random-access input iterator type for reading
  //!   input samples. @iterator
  //!
  //! @tparam CounterT
  //!   **[inferred]** Integer type for histogram bin counters
  //!
  //! @tparam LevelT
  //!   **[inferred]** Type for specifying boundaries (levels)
  //!
  //! @tparam OffsetT
  //!   **[inferred]** Signed integer type for sequence offsets, list lengths,
  //!   pointer differences, etc. @offset_size1
  //!
  //! @param[in] d_temp_storage
  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
  //!   required allocation size is written to `temp_storage_bytes` and no
  //!   work is done.
  //!
  //! @param[in,out] temp_storage_bytes
  //!   Reference to size in bytes of `d_temp_storage` allocation
  //!
  //! @param[in] d_samples
  //!   The pointer to the multi-channel input sequence of data samples.
  //!   The samples from different channels are assumed to be interleaved (e.g.,
  //!   an array of 32-bit pixels where each pixel consists of four *RGBA*
  //!   8-bit samples).
  //!
  //! @param[out] d_histogram
  //!   @rst
  //!   The pointers to the histogram counter output arrays, one for each active
  //!   channel. For channel\ :sub:`i`, the allocation length of
  //!   ``d_histogram[i]`` should be ``num_levels[i] - 1``.
  //!   @endrst
  //!
  //! @param[in] num_levels
  //!   @rst
  //!   The number of boundaries (levels) for delineating histogram samples in
  //!   each active channel. Implies that the number of bins for
  //!   channel\ :sub:`i` is ``num_levels[i] - 1``.
  //!   @endrst
  //!
  //! @param[in] d_levels
  //!   The pointers to the arrays of boundaries (levels), one for each active
  //!   channel. Bin ranges are defined by consecutive boundary pairings: lower
  //!   sample value boundaries are inclusive and upper sample value boundaries
  //!   are exclusive.
  //!
  //! @param[in] num_pixels
  //!   The number of multi-channel pixels (i.e., the length of `d_samples / NUM_CHANNELS`)
  //!
  //! @param[in] stream
  //!   @rst
  //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
  //!   @endrst
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    ::cuda::std::array<CounterT*, NUM_ACTIVE_CHANNELS> d_histogram,
    ::cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_levels,
    ::cuda::std::array<const LevelT*, NUM_ACTIVE_CHANNELS> d_levels,
    OffsetT num_pixels,
    cudaStream_t stream = 0)
  {
    /// The sample value type of the input iterator
    using SampleT = cub::detail::it_value_t<SampleIteratorT>;

    return MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_histogram,
      num_levels,
      d_levels,
      num_pixels,
      (OffsetT) 1,
      (size_t) (sizeof(SampleT) * NUM_CHANNELS * num_pixels),
      stream);
  }

  //! Deprecate [Since 3.0]
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CCCL_DEPRECATED_BECAUSE("Prefer the new overload taking cuda::std::arrays")
  CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram[NUM_ACTIVE_CHANNELS],
    const int num_levels[NUM_ACTIVE_CHANNELS],
    const LevelT* const d_levels[NUM_ACTIVE_CHANNELS],
    OffsetT num_pixels,
    cudaStream_t stream = 0)
  {
    return MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      to_array<NUM_ACTIVE_CHANNELS>(d_histogram),
      to_array<NUM_ACTIVE_CHANNELS>(num_levels),
      to_array<NUM_ACTIVE_CHANNELS>(d_levels),
      num_pixels,
      stream);
  }

  //! @rst
  //! Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using
  //! the specified bin boundary levels.
  //!
  //! - The input is a sequence of *pixel* structures, where each pixel comprises
  //!   a record of ``NUM_CHANNELS`` consecutive data samples (e.g., an *RGBA* pixel).
  //! - ``NUM_CHANNELS`` can be up to 4.
  //! - Of the ``NUM_CHANNELS`` specified, the function will only compute
  //!   histograms for the first ``NUM_ACTIVE_CHANNELS`` (e.g., *RGB* histograms from *RGBA* pixel samples).
  //! - A two-dimensional *region of interest* within ``d_samples`` can be
  //!   specified using the ``num_row_samples``, ``num_rows``, and ``row_stride_bytes`` parameters.
  //! - The row stride must be a whole multiple of the sample data type
  //!   size, i.e., ``(row_stride_bytes % sizeof(SampleT)) == 0``.
  //! - The number of histogram bins for channel\ :sub:`i` is ``num_levels[i] - 1``.
  //! - For channel\ :sub:`i`, the range of values for all histogram bins have the same width:
  //!   ``(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)``
  //! - For a given row ``r`` in ``[0, num_rows)``, and sample ``s`` in ``[0, num_row_pixels)``, let
  //!   ``row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)``,
  //!   ``sample_begin = row_begin + s * NUM_CHANNELS``, and
  //!   ``sample_end = sample_begin + NUM_ACTIVE_CHANNELS``. For given channels
  //!   ``c1`` and ``c2`` in ``[0, NUM_ACTIVE_CHANNELS)``, the range
  //!   ``[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)`` shall not overlap
  //!   ``[sample_begin, sample_end)`` nor
  //!   ``[d_levels[c2], d_levels[c2] + num_levels[c2])`` in any way. The ranges
  //!   ``[d_levels[c2], d_levels[c2] + num_levels[c2])`` and
  //!   ``[sample_begin, sample_end)`` may overlap.
  //! - @devicestorage
  //!
  //! Snippet
  //! +++++++
  //!
  //! The code snippet below illustrates the computation of three 4-bin *RGB*
  //! histograms from a 2x3 region of interest of within a flattened 2x4 array
  //! of quad-channel *RGBA* pixels (8 bits per channel per pixel).
  //!
  //!
  //! .. code-block:: c++
  //!
  //!    #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
  //!
  //!    // Declare, allocate, and initialize device-accessible pointers for input
  //!    // samples and output histograms
  //!    int              num_row_pixels;     // e.g., 3
  //!    int              num_rows;           // e.g., 2
  //!    size_t           row_stride_bytes;   // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS
  //!    unsigned char*   d_samples;          // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -),
  //!                                         //        (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)]
  //!    int*             d_histogram[3];     // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]];
  //!    int              num_levels[3];      // e.g., {5, 5, 5};
  //!    unsigned int*    d_levels[3];        // e.g., [ [0, 2, 4, 6, 8],
  //!                                         //         [0, 2, 4, 6, 8],
  //!                                         //         [0, 2, 4, 6, 8] ];
  //!    ...
  //!
  //!    // Determine temporary device storage requirements
  //!    void*    d_temp_storage = nullptr;
  //!    size_t   temp_storage_bytes = 0;
  //!    cub::DeviceHistogram::MultiHistogramRange<4, 3>(
  //!      d_temp_storage, temp_storage_bytes,
  //!      d_samples, d_histogram, num_levels, d_levels,
  //!      num_row_pixels, num_rows, row_stride_bytes);
  //!
  //!    // Allocate temporary storage
  //!    cudaMalloc(&d_temp_storage, temp_storage_bytes);
  //!
  //!    // Compute histograms
  //!    cub::DeviceHistogram::MultiHistogramRange<4, 3>(
  //!      d_temp_storage, temp_storage_bytes,
  //!      d_samples, d_histogram, num_levels,
  //!      d_levels, num_row_pixels, num_rows, row_stride_bytes);
  //!
  //!    // d_histogram   <-- [ [2, 3, 0, 1],
  //!    //                     [3, 0, 0, 2],
  //!    //                     [1, 2, 0, 3] ]
  //!
  //! @endrst
  //!
  //! @tparam NUM_CHANNELS
  //!   Number of channels interleaved in the input data (may be greater than
  //!   the number of channels being actively histogrammed)
  //!
  //! @tparam NUM_ACTIVE_CHANNELS
  //!   **[inferred]** Number of channels actively being histogrammed
  //!
  //! @tparam SampleIteratorT
  //!   **[inferred]** Random-access input iterator type for reading input
  //!   samples. @iterator
  //!
  //! @tparam CounterT
  //!   **[inferred]** Integer type for histogram bin counters
  //!
  //! @tparam LevelT
  //!   **[inferred]** Type for specifying boundaries (levels)
  //!
  //! @tparam OffsetT
  //!   **[inferred]** Signed integer type for sequence offsets, list lengths,
  //!   pointer differences, etc. @offset_size1
  //!
  //! @param[in] d_temp_storage
  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
  //!   required allocation size is written to `temp_storage_bytes` and no work is done.
  //!
  //! @param[in,out] temp_storage_bytes
  //!   Reference to size in bytes of `d_temp_storage` allocation
  //!
  //! @param[in] d_samples
  //!   The pointer to the multi-channel input sequence of data samples. The
  //!   samples from different channels are assumed to be interleaved (e.g., an
  //!   array of 32-bit pixels where each pixel consists of four
  //!   *RGBA* 8-bit samples).
  //!
  //! @param[out] d_histogram
  //!   @rst
  //!   The pointers to the histogram counter output arrays, one for each active
  //!   channel. For channel\ :sub:`i`, the allocation length of
  //!   ``d_histogram[i]`` should be ``num_levels[i] - 1``.
  //!   @endrst
  //!
  //! @param[in] num_levels
  //!   @rst
  //!   The number of boundaries (levels) for delineating histogram samples in
  //!   each active channel. Implies that the number of bins for
  //!   channel\ :sub:`i` is ``num_levels[i] - 1``.
  //!   @endrst
  //!
  //! @param[in] d_levels
  //!   The pointers to the arrays of boundaries (levels), one for each active
  //!   channel. Bin ranges are defined by consecutive boundary pairings: lower
  //!   sample value boundaries are inclusive and upper sample value boundaries
  //!   are exclusive.
  //!
  //! @param[in] num_row_pixels
  //!   The number of multi-channel pixels per row in the region of interest
  //!
  //! @param[in] num_rows
  //!   The number of rows in the region of interest
  //!
  //! @param[in] row_stride_bytes
  //!   The number of bytes between starts of consecutive rows in the
  //!   region of interest
  //!
  //! @param[in] stream
  //!   @rst
  //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
  //!   @endrst
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    ::cuda::std::array<CounterT*, NUM_ACTIVE_CHANNELS> d_histogram,
    ::cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_levels,
    ::cuda::std::array<const LevelT*, NUM_ACTIVE_CHANNELS> d_levels,
    OffsetT num_row_pixels,
    OffsetT num_rows,
    size_t row_stride_bytes,
    cudaStream_t stream = 0)
  {
    _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceHistogram::MultiHistogramRange");

    /// The sample value type of the input iterator
    using SampleT = cub::detail::it_value_t<SampleIteratorT>;
    ::cuda::std::bool_constant<sizeof(SampleT) == 1> is_byte_sample;

    if constexpr (sizeof(OffsetT) > sizeof(int))
    {
      if ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX)
      {
        // Down-convert OffsetT data type
        return DispatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchRange(
          d_temp_storage,
          temp_storage_bytes,
          d_samples,
          d_histogram,
          num_levels,
          d_levels,
          (int) num_row_pixels,
          (int) num_rows,
          (int) (row_stride_bytes / sizeof(SampleT)),
          stream,
          is_byte_sample);
      }
    }

    return DispatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchRange(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_histogram,
      num_levels,
      d_levels,
      num_row_pixels,
      num_rows,
      (OffsetT) (row_stride_bytes / sizeof(SampleT)),
      stream,
      is_byte_sample);
  }

  //! Deprecate [Since 3.0]
  template <int NUM_CHANNELS,
            int NUM_ACTIVE_CHANNELS,
            typename SampleIteratorT,
            typename CounterT,
            typename LevelT,
            typename OffsetT>
  CCCL_DEPRECATED_BECAUSE("Prefer the new overload taking cuda::std::arrays")
  CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    SampleIteratorT d_samples,
    CounterT* d_histogram[NUM_ACTIVE_CHANNELS],
    const int num_levels[NUM_ACTIVE_CHANNELS],
    const LevelT* const d_levels[NUM_ACTIVE_CHANNELS],
    OffsetT num_row_pixels,
    OffsetT num_rows,
    size_t row_stride_bytes,
    cudaStream_t stream = 0)
  {
    return MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      to_array<NUM_ACTIVE_CHANNELS>(d_histogram),
      to_array<NUM_ACTIVE_CHANNELS>(num_levels),
      to_array<NUM_ACTIVE_CHANNELS>(d_levels),
      num_row_pixels,
      num_rows,
      row_stride_bytes,
      stream);
  }

  //@}  end member group
};

CUB_NAMESPACE_END
